In [1]:
import seaborn as sns
import pandas as pd
import plotly.express as px
import numpy as np
In [2]:
data = sns.load_dataset("tips")
data.head(5)
Out[2]:
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
In [3]:
# Goal: To train an AI to predict the tip from the total bill
In [4]:
features = data[['total_bill']]
tip = data['tip']
from sklearn import linear_model
f = linear_model.LinearRegression(fit_intercept = False) # create linear regression model
f.fit(features,tip)
Out[4]:
LinearRegression(fit_intercept=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [5]:
f.predict([[100]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Out[5]:
array([14.37318953])
In [6]:
f.predict([[70]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
Out[6]:
array([10.06123267])
In [7]:
# add predictions to dataframe for later plotting
data['prediction'] = f.predict(features) # prediction is the predicted tip.
In [8]:
data
Out[8]:
| total_bill | tip | sex | smoker | day | time | size | prediction | |
|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 2.442005 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 1.486188 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 3.019807 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 3.403571 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 3.534367 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 4.172537 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 3.906633 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 3.258402 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 2.561302 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 2.699285 |
244 rows × 8 columns
In [9]:
# summarize how good the model is
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['tip'],
mode = 'markers', name = 'actual'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction'],
mode = 'lines', name = 'predicted'))
fig.update_layout(font_size = 20)
# note: overlaying two plots on one pair of axes requires us to use this sort of code
In [10]:
# get coefficients and intercept from our sklearn model
f.coef_
Out[10]:
array([0.1437319])
In [11]:
f.intercept_
# we told the model intercept to be zero.
Out[11]:
0.0
In [12]:
# fit with intercept
f_w_int = linear_model.LinearRegression(fit_intercept = True)
f_w_int.fit(features,tip)
Out[12]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [13]:
f_w_int.predict([[100]])
# different from f.predict([[100]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Out[13]:
array([11.42272135])
In [14]:
f_w_int.predict([[0]])
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Out[14]:
array([0.92026961])
In [15]:
data['prediction_with_intercept'] = f_w_int.predict(data[['total_bill']])
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['tip'],
mode = 'markers', name = 'actual'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction'],
mode = 'lines', name = 'predicted(b=0)'))
fig.add_trace(go.Scatter(x=data['total_bill'], y = data['prediction_with_intercept'],
mode = 'lines', name = 'predicted(b != 0)'))
fig.update_layout(font_size = 20)
In [16]:
f_w_int.coef_
Out[16]:
array([0.10502452])
In [17]:
f_w_int.intercept_
Out[17]:
0.9202696135546731
In [18]:
# Our new model is predicting the tip is 0.1050*total_bill+0.92. In other works that humans tip
# 92 cents, then add a 10.5 percent tip on top.
Or in x/y notation, our two models are:
1. y = 0.1437x
2. y = 0.92 + 0.105x
Cell In[18], line 4 1. y = 0.1437x ^ SyntaxError: invalid decimal literal
In [19]:
# Loss Functions
In [20]:
# Computing the L2 Loss and MSE
In [21]:
data = sns.load_dataset('tips')
data['prediction'] = f.predict(data[['total_bill']])
data.head(5)
Out[21]:
| total_bill | tip | sex | smoker | day | time | size | prediction | |
|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 2.442005 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 1.486188 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 3.019807 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 3.403571 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 3.534367 |
In [22]:
# add loss to the dataframe
data['l2_loss'] = (data['tip'] - data['prediction']) ** 2
data.head(5)
Out[22]:
| total_bill | tip | sex | smoker | day | time | size | prediction | l2_loss | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 2.442005 | 2.050638 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 1.486188 | 0.030211 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 3.019807 | 0.230585 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 3.403571 | 0.008756 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 3.534367 | 0.005720 |
In [23]:
# compute the mean
np.mean(data['l2_loss'])
Out[23]:
1.1781161154513171
In [24]:
data['l2_loss'].mean()
Out[24]:
1.1781161154513171
In [25]:
# To calculate the mean squared error in practice, use the mean squared error function provided by sklean.metrics
In [26]:
## or we can compute .MSE directly from the outcome and predictions using mean_squared_error
from sklearn.metrics import mean_squared_error
mean_squared_error(data['tip'],f.predict(data[['total_bill']]))
Out[26]:
1.1781161154513171
In [27]:
mean_squared_error(data['tip'],data['prediction'])
Out[27]:
1.1781161154513171
In [28]:
# Understanding that MSE is a Function of Oe Variable (Theta)
In [29]:
data
Out[29]:
| total_bill | tip | sex | smoker | day | time | size | prediction | l2_loss | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 2.442005 | 2.050638 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 1.486188 | 0.030211 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 3.019807 | 0.230585 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 3.403571 | 0.008756 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 3.534367 | 0.005720 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 4.172537 | 3.053627 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 3.906633 | 3.635249 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 3.258402 | 1.583576 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 2.561302 | 0.658212 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 2.699285 | 0.090430 |
244 rows × 9 columns
In [30]:
# replace the prediction column and L2 loss columns with the corresponding values for 20% tip
data['prediction'] = data['total_bill'] * 0.2
data['l2_loss'] = (data['prediction'] - data['tip'])**2
data
Out[30]:
| total_bill | tip | sex | smoker | day | time | size | prediction | l2_loss | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 3.398 | 5.702544 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 2.068 | 0.166464 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 4.202 | 0.492804 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 4.736 | 2.033476 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 4.918 | 1.710864 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 5.806 | 0.012996 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 5.436 | 11.806096 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 4.534 | 6.421156 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 3.564 | 3.290596 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 3.756 | 0.571536 |
244 rows × 9 columns
In [31]:
# compute the mean_squared_error for 20% tip
mean_squared_error(data['prediction'], data['tip'])
Out[31]:
2.667486278688525
In [32]:
# computer error for a 20% tip on one line (without creating prediction and l2 loss column)
mean_squared_error(data['total_bill'] * 0.2, data['tip'])
Out[32]:
2.667486278688525
In [33]:
mean_squared_error(data['total_bill'] * 0.08, data['tip'])
Out[33]:
3.08881276852459
In [34]:
mean_squared_error(data['total_bill'] * 0.3, data['tip'])
Out[34]:
12.66543732377049
In [35]:
# define a function mse_given_theta as a function which takes theto
# and compute MSE
def mse_given_theta(theta):
return mean_squared_error(data['total_bill'] * theta, data['tip'])
In [36]:
mse_given_theta(0.3)
Out[36]:
12.66543732377049
In [37]:
mse_given_theta(0.2)
Out[37]:
2.667486278688525
In [38]:
# create a list of thetas
thetas = np.linspace(0.1,0.2,100)
thetas
Out[38]:
array([0.1 , 0.1010101 , 0.1020202 , 0.1030303 , 0.1040404 ,
0.10505051, 0.10606061, 0.10707071, 0.10808081, 0.10909091,
0.11010101, 0.11111111, 0.11212121, 0.11313131, 0.11414141,
0.11515152, 0.11616162, 0.11717172, 0.11818182, 0.11919192,
0.12020202, 0.12121212, 0.12222222, 0.12323232, 0.12424242,
0.12525253, 0.12626263, 0.12727273, 0.12828283, 0.12929293,
0.13030303, 0.13131313, 0.13232323, 0.13333333, 0.13434343,
0.13535354, 0.13636364, 0.13737374, 0.13838384, 0.13939394,
0.14040404, 0.14141414, 0.14242424, 0.14343434, 0.14444444,
0.14545455, 0.14646465, 0.14747475, 0.14848485, 0.14949495,
0.15050505, 0.15151515, 0.15252525, 0.15353535, 0.15454545,
0.15555556, 0.15656566, 0.15757576, 0.15858586, 0.15959596,
0.16060606, 0.16161616, 0.16262626, 0.16363636, 0.16464646,
0.16565657, 0.16666667, 0.16767677, 0.16868687, 0.16969697,
0.17070707, 0.17171717, 0.17272727, 0.17373737, 0.17474747,
0.17575758, 0.17676768, 0.17777778, 0.17878788, 0.17979798,
0.18080808, 0.18181818, 0.18282828, 0.18383838, 0.18484848,
0.18585859, 0.18686869, 0.18787879, 0.18888889, 0.18989899,
0.19090909, 0.19191919, 0.19292929, 0.19393939, 0.19494949,
0.1959596 , 0.1969697 , 0.1979798 , 0.1989899 , 0.2 ])
In [39]:
# compute MSEs for those thetas
mses = [mse_given_theta(theta) for theta in thetas]
mses
Out[39]:
[2.0777683729508194, 2.0366887534058913, 1.996569059699077, 1.9574092918303747, 1.919209449799786, 1.8819695336073097, 1.8456895432529465, 1.8103694787366964, 1.7760093400585586, 1.7426091272185338, 1.7101688402166224, 1.678688479052823, 1.6481680437271375, 1.6186075342395636, 1.5900069505901033, 1.5623662927787565, 1.5356855608055218, 1.5099647546704, 1.4852038743733909, 1.461402919914495, 1.4385618912937121, 1.4166807885110417, 1.3957596115664843, 1.37579836046004, 1.3567970351917082, 1.3387556357614898, 1.3216741621693837, 1.3055526144153906, 1.2903909924995107, 1.2761892964217436, 1.262947526182089, 1.2506656817805475, 1.2393437632171185, 1.2289817704918033, 1.2195797036046003, 1.2111375625555103, 1.2036553473445333, 1.197133057971669, 1.1915706944369175, 1.1869682567402793, 1.1833257448817533, 1.1806431588613406, 1.1789204986790405, 1.1781577643348538, 1.1783549558287796, 1.1795120731608182, 1.18162911633097, 1.1847060853392344, 1.1887429801856118, 1.193739800870102, 1.1996965473927048, 1.206613219753421, 1.21448981795225, 1.2233263419891915, 1.233122791864246, 1.2438791675774135, 1.255595469128694, 1.2682716965180871, 1.2819078497455934, 1.2965039288112117, 1.312059933714944, 1.3285758644567889, 1.3460517210367458, 1.3644875034548163, 1.3838832117109996, 1.4042388458052961, 1.4255544057377052, 1.4478298915082273, 1.471065303116862, 1.4952606405636095, 1.5204159038484695, 1.5465310929714433, 1.5736062079325297, 1.6016412487317289, 1.630636215369041, 1.6605911078444662, 1.6915059261580032, 1.7233806703096541, 1.7562153402994167, 1.7900099361272936, 1.824764457793283, 1.8604789052973851, 1.8971532786396006, 1.9347875778199284, 1.97338180283837, 2.0129359536949227, 2.053450030389589, 2.0949240329223673, 2.13735796129326, 2.1807518155022656, 2.225105595549384, 2.270419301434615, 2.316692933157959, 2.3639264907194146, 2.4121199741189843, 2.461273383356666, 2.5113867184324614, 2.56245997934637, 2.614493166098391, 2.667486278688525]
In [40]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
xaxis_title = 'theta',
yaxis_title = 'MSE',
font_size = 20
)
In [41]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
xaxis_title = r'$\theta',
yaxis_title = 'MSE',
font_size = 20
)
In [42]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
xaxis_title = r'$\huge(\theta)$',
yaxis_title = 'MSE',
font_size = 20
)
# The Plotly library supports LaTeX syntax
In [43]:
# to find the best theta, make a plot
# plot MSE vs theta, showing off how to set xaxis_title and yaxis_title
fig = px.line(x = thetas, y = mses)
fig.update_layout(
xaxis_title = 'θ',
yaxis_title = 'MSE',
font_size = 20
)
# search from google, copy and paste
In [44]:
# The minimum mean squared error happens right where the scikit-learn model picked Theta.
In [45]:
# Using SciPy Optimize to Optimize L2 Loss
In [46]:
import scipy.optimize
In [47]:
# define g as the cube of x plus square of x minus 3 times x plus 2
def g(x):
return x**3 + x**2 - 3*x + 2
In [48]:
g(12)
Out[48]:
1838
In [49]:
# use scipy.optimze. minimize and compare with wolfram alpha
scipy.optimize.minimize(g, x0 = 1000)
Out[49]:
message: Optimization terminated successfully.
success: True
status: 0
fun: 0.7316461776530541
x: [ 7.208e-01]
nit: 20
jac: [ 1.788e-07]
hess_inv: [[ 1.580e-01]]
nfev: 48
njev: 24
In [50]:
#visualize g
px.line(x = np.linspace(-3,2,100), y =g(np.linspace(-3,2,100)))
# When the function is plotted, we see that the minimizing value is around 0.72.
In [51]:
scipy.optimize.minimize(mse_given_theta, x0 = 0.2)
Out[51]:
message: Optimization terminated successfully.
success: True
status: 0
fun: 1.1781161154513287
x: [ 1.437e-01]
nit: 1
jac: [ 2.384e-06]
hess_inv: [[1]]
nfev: 6
njev: 3
In [52]:
# There are many minimization libraries that use various types of numerical techniques
In [53]:
# This minimization library can fail.
scipy.optimize.minimize(g, x0 = -3)
# The success flag comes up as False
# There is no true absolute minimum for this function
# So it will be important to pick a loss function that has a nice shape for optimization and a minimum
Out[53]:
message: Desired error not necessarily achieved due to precision loss.
success: False
status: 2
fun: -1114853117.349824
x: [-1.037e+03]
nit: 1
jac: [ 3.226e+06]
hess_inv: [[-3.206e-04]]
nfev: 236
njev: 112
In [54]:
# Multiple Linear Regression
In [55]:
data
Out[55]:
| total_bill | tip | sex | smoker | day | time | size | prediction | l2_loss | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 3.398 | 5.702544 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 2.068 | 0.166464 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 4.202 | 0.492804 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 4.736 | 2.033476 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 4.918 | 1.710864 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 5.806 | 0.012996 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 5.436 | 11.806096 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 4.534 | 6.421156 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 3.564 | 3.290596 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 3.756 | 0.571536 |
244 rows × 9 columns
In [56]:
# fit a model on total_bill and size
features = data[['total_bill', 'size']]
tip = data['tip']
f2 = linear_model.LinearRegression(fit_intercept = False)
f2.fit(features, tip)
Out[56]:
LinearRegression(fit_intercept=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [57]:
# show coefficients
f2.coef_
# two coefficients, one for 'total_bill', the other for 'size'
Out[57]:
array([0.1007119 , 0.36209717])
In [58]:
# make a prediction for a table with $10 total bill and 3 people seated
f2.predict([[10,3]])
# predictions: f2 model: $2.09
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Out[58]:
array([2.09341054])
In [59]:
# previous model f
f.predict([[10]])
# f model: $1.43
C:\Users\liliz\AnacondaInstall\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Out[59]:
array([1.43731895])
In [60]:
# the f2 model was trained on 2D data, so it can only make predictions on 2D data.
In [61]:
# If there are k parameters in a linear model, you need k features.
In [62]:
# compare preditions for f and f2 side by side in table
data['prediction'] = f.predict(data[['total_bill']])
data['prediction_2d'] = f2.predict(data[['total_bill', 'size']])
In [63]:
data
Out[63]:
| total_bill | tip | sex | smoker | day | time | size | prediction | l2_loss | prediction_2d | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 2.442005 | 5.702544 | 2.435290 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 1.486188 | 0.166464 | 2.127653 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 3.019807 | 0.492804 | 3.202249 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 3.403571 | 2.033476 | 3.109052 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 3.534367 | 1.710864 | 3.924894 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 4.172537 | 0.012996 | 4.009958 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 3.906633 | 11.806096 | 3.461544 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 3.258402 | 6.421156 | 3.007333 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | 2.561302 | 3.290596 | 2.518880 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 2.699285 | 0.571536 | 2.615564 |
244 rows × 10 columns
In [64]:
# so which is better, f or f2? You can use loss functions and compute the mean squared error(MSE)
In [65]:
# compare MSE for f and f2
mean_squared_error(data['prediction'],data['tip'])
Out[65]:
1.1781161154513171
In [66]:
mean_squared_error(data['prediction_2d'], data['tip'])
Out[66]:
1.06482122862577
In [67]:
# predictions: 1D model: 1.178, 2D model: 1.065
# 2D model is better since it gets a lower mean squared error.
# the model gave higher-quality predictions when it had more information.
In [68]:
# Let explore what are the models acutally doing.
In [69]:
# make 3d plot of our data
px.scatter_3d(data, x = 'total_bill', y = 'size', z = 'tip')
# 3D plot: As the total bill goes up, the tip goes up; as the size goes up, the tip goes up.
In [70]:
# This code is not something we expect you to understand!
# It's just computing predictions for various bills and table sizes
table_bills, table_sizes = np.meshgrid(range(50), range(6))
tip_predictions = (0.1007119 * table_bills + 0.3621 * table_sizes)
In [71]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter3d(x = data['total_bill'], y = data['size'], z = data['tip'],
mode = 'markers', name = 'actual'))
fig.add_trace(go.Surface(x = table_bills, y = table_sizes, z = tip_predictions, name = 'predicted'))
fig.show()
# 2D plot: The plane increases with both size the the tip; the model is finding the plane of best fit
In [72]:
f.coef_
Out[72]:
array([0.1437319])
In [73]:
f2.coef_
Out[73]:
array([0.1007119 , 0.36209717])
In [74]:
# Our 1D and 2D models as equation
# 1. tip = 0.1437 * bill
# 2. tip = 0.1*bill + 0.36*size
# Even though model 2 has lower MSE, model 1 is probably a better model of reality.
# Model 2 is overfitting.
# Model needs to make sense.
In [75]:
# Using Nonnumberic Features
In [76]:
data = sns.load_dataset('tips')
data.head(5)
Out[76]:
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
In [77]:
# create a copy of the dataset that only has 3 features in order to keep things simple
three_features = ['total_bill','size','day']
three_feature_data = pd.DataFrame(data[three_features])
three_feature_data.iloc[[193,90,25,26,190],:]
Out[77]:
| total_bill | size | day | |
|---|---|---|---|
| 193 | 15.48 | 2 | Thur |
| 90 | 28.97 | 2 | Fri |
| 25 | 17.81 | 4 | Sat |
| 26 | 13.37 | 2 | Sat |
| 190 | 15.69 | 2 | Sun |
In [78]:
# Let's create 'dummies' that represent whether it is thursday, friday, saturday, or sunday
dummies = pd.get_dummies(three_feature_data['day'])
dummies.iloc[[193,90,25, 26,190], :]
Out[78]:
| Thur | Fri | Sat | Sun | |
|---|---|---|---|---|
| 193 | True | False | False | False |
| 90 | False | True | False | False |
| 25 | False | False | True | False |
| 26 | False | False | True | False |
| 190 | False | False | False | True |
In [79]:
# concatenate the dummies table with three_feature_data
# pd.concat adds rows or columns to a data frame
data_w_dummies = pd.concat([three_feature_data, dummies], axis = 1)
data_w_dummies.iloc[[193,90,25,26,190],:]
Out[79]:
| total_bill | size | day | Thur | Fri | Sat | Sun | |
|---|---|---|---|---|---|---|---|
| 193 | 15.48 | 2 | Thur | True | False | False | False |
| 90 | 28.97 | 2 | Fri | False | True | False | False |
| 25 | 17.81 | 4 | Sat | False | False | True | False |
| 26 | 13.37 | 2 | Sat | False | False | True | False |
| 190 | 15.69 | 2 | Sun | False | False | False | True |
In [80]:
# The code below will crash since data_w_dummies includes a non-numeric feature
f_with_day = linear_model.LinearRegreassion(fit_intercept=False)
f_with_day.fit(data_w_dummies, tip)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[80], line 2 1 # The code below will crash since data_w_dummies includes a non-numeric feature ----> 2 f_with_day = linear_model.LinearRegreassion(fit_intercept=False) 3 f_with_day.fit(data_w_dummies, tip) AttributeError: module 'sklearn.linear_model' has no attribute 'LinearRegreassion'
In [81]:
# drop the non-numeric column
del data_w_dummies['day']
data_w_dummies.head(5)
Out[81]:
| total_bill | size | Thur | Fri | Sat | Sun | |
|---|---|---|---|---|---|---|
| 0 | 16.99 | 2 | False | False | False | True |
| 1 | 10.34 | 3 | False | False | False | True |
| 2 | 21.01 | 3 | False | False | False | True |
| 3 | 23.68 | 2 | False | False | False | True |
| 4 | 24.59 | 4 | False | False | False | True |
In [82]:
# fit the model
f_with_day = linear_model.LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_dummies, tip)
Out[82]:
LinearRegression(fit_intercept=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [83]:
# The model has been trined on numeric and nonnumeric features
In [84]:
f_with_day.coef_
Out[84]:
array([0.09299361, 0.18713231, 0.66829361, 0.74578683, 0.62112858,
0.73228865])
In [85]:
# Tip prediction
# Size: 3
# Total bill: $50
# Day: Thursday
In [86]:
# Computing the value using our model
# Thursday: $5.88
# Saturday: $5.83
# Sunday: $5.94
In [87]:
# What has the AI learned?
In [88]:
# Evalulating the 6D model:
# It does slightly better than the 2D and 1D models.
# Dule to overfitting, it might perform worse on new observations
In [89]:
px.scatter(data, x='total_bill', y= 'tip', color = 'day', trendline = 'ols')
In [ ]: